# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
!{sys.executable} -m pip install numpy plotly
Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.8/site-packages (1.19.2) Requirement already satisfied: plotly in /opt/anaconda3/lib/python3.8/site-packages (4.14.3) Requirement already satisfied: six in /opt/anaconda3/lib/python3.8/site-packages (from plotly) (1.15.0) Requirement already satisfied: retrying>=1.3.3 in /opt/anaconda3/lib/python3.8/site-packages (from plotly) (1.3.3)
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
df=pd.read_csv('prosperLoanData.csv')
df.head()
| ListingKey | ListingNumber | ListingCreationDate | CreditGrade | Term | LoanStatus | ClosedDate | BorrowerAPR | BorrowerRate | LenderYield | ... | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1021339766868145413AB3B | 193129 | 2007-08-26 19:09:29.263000000 | C | 36 | Completed | 2009-08-14 00:00:00 | 0.16516 | 0.1580 | 0.1380 | ... | -133.18 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 258 |
| 1 | 10273602499503308B223C1 | 1209647 | 2014-02-27 08:28:07.900000000 | NaN | 36 | Current | NaN | 0.12016 | 0.0920 | 0.0820 | ... | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 2 | 0EE9337825851032864889A | 81716 | 2007-01-05 15:00:47.090000000 | HR | 36 | Completed | 2009-12-17 00:00:00 | 0.28269 | 0.2750 | 0.2400 | ... | -24.20 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 41 |
| 3 | 0EF5356002482715299901A | 658116 | 2012-10-22 11:02:35.010000000 | NaN | 36 | Current | NaN | 0.12528 | 0.0974 | 0.0874 | ... | -108.01 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 158 |
| 4 | 0F023589499656230C5E3E2 | 909464 | 2013-09-14 18:38:39.097000000 | NaN | 36 | Current | NaN | 0.24614 | 0.2085 | 0.1985 | ... | -60.27 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 20 |
5 rows Ć 81 columns
print(df.keys())
print(len(df.keys()))
Index(['ListingKey', 'ListingNumber', 'ListingCreationDate', 'CreditGrade',
'Term', 'LoanStatus', 'ClosedDate', 'BorrowerAPR', 'BorrowerRate',
'LenderYield', 'EstimatedEffectiveYield', 'EstimatedLoss',
'EstimatedReturn', 'ProsperRating (numeric)', 'ProsperRating (Alpha)',
'ProsperScore', 'ListingCategory (numeric)', 'BorrowerState',
'Occupation', 'EmploymentStatus', 'EmploymentStatusDuration',
'IsBorrowerHomeowner', 'CurrentlyInGroup', 'GroupKey',
'DateCreditPulled', 'CreditScoreRangeLower', 'CreditScoreRangeUpper',
'FirstRecordedCreditLine', 'CurrentCreditLines', 'OpenCreditLines',
'TotalCreditLinespast7years', 'OpenRevolvingAccounts',
'OpenRevolvingMonthlyPayment', 'InquiriesLast6Months', 'TotalInquiries',
'CurrentDelinquencies', 'AmountDelinquent', 'DelinquenciesLast7Years',
'PublicRecordsLast10Years', 'PublicRecordsLast12Months',
'RevolvingCreditBalance', 'BankcardUtilization',
'AvailableBankcardCredit', 'TotalTrades',
'TradesNeverDelinquent (percentage)', 'TradesOpenedLast6Months',
'DebtToIncomeRatio', 'IncomeRange', 'IncomeVerifiable',
'StatedMonthlyIncome', 'LoanKey', 'TotalProsperLoans',
'TotalProsperPaymentsBilled', 'OnTimeProsperPayments',
'ProsperPaymentsLessThanOneMonthLate',
'ProsperPaymentsOneMonthPlusLate', 'ProsperPrincipalBorrowed',
'ProsperPrincipalOutstanding', 'ScorexChangeAtTimeOfListing',
'LoanCurrentDaysDelinquent', 'LoanFirstDefaultedCycleNumber',
'LoanMonthsSinceOrigination', 'LoanNumber', 'LoanOriginalAmount',
'LoanOriginationDate', 'LoanOriginationQuarter', 'MemberKey',
'MonthlyLoanPayment', 'LP_CustomerPayments',
'LP_CustomerPrincipalPayments', 'LP_InterestandFees', 'LP_ServiceFees',
'LP_CollectionFees', 'LP_GrossPrincipalLoss', 'LP_NetPrincipalLoss',
'LP_NonPrincipalRecoverypayments', 'PercentFunded', 'Recommendations',
'InvestmentFromFriendsCount', 'InvestmentFromFriendsAmount',
'Investors'],
dtype='object')
81
df.info()
#Following columns' datatype are not correct.
#ListingCreationDate
#ClosedDate
#DateCreditPulled
#LoanOriginationDate
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingKey 113937 non-null object 1 ListingNumber 113937 non-null int64 2 ListingCreationDate 113937 non-null object 3 CreditGrade 28953 non-null object 4 Term 113937 non-null int64 5 LoanStatus 113937 non-null object 6 ClosedDate 55089 non-null object 7 BorrowerAPR 113912 non-null float64 8 BorrowerRate 113937 non-null float64 9 LenderYield 113937 non-null float64 10 EstimatedEffectiveYield 84853 non-null float64 11 EstimatedLoss 84853 non-null float64 12 EstimatedReturn 84853 non-null float64 13 ProsperRating (numeric) 84853 non-null float64 14 ProsperRating (Alpha) 84853 non-null object 15 ProsperScore 84853 non-null float64 16 ListingCategory (numeric) 113937 non-null int64 17 BorrowerState 108422 non-null object 18 Occupation 110349 non-null object 19 EmploymentStatus 111682 non-null object 20 EmploymentStatusDuration 106312 non-null float64 21 IsBorrowerHomeowner 113937 non-null bool 22 CurrentlyInGroup 113937 non-null bool 23 GroupKey 13341 non-null object 24 DateCreditPulled 113937 non-null object 25 CreditScoreRangeLower 113346 non-null float64 26 CreditScoreRangeUpper 113346 non-null float64 27 FirstRecordedCreditLine 113240 non-null object 28 CurrentCreditLines 106333 non-null float64 29 OpenCreditLines 106333 non-null float64 30 TotalCreditLinespast7years 113240 non-null float64 31 OpenRevolvingAccounts 113937 non-null int64 32 OpenRevolvingMonthlyPayment 113937 non-null float64 33 InquiriesLast6Months 113240 non-null float64 34 TotalInquiries 112778 non-null float64 35 CurrentDelinquencies 113240 non-null float64 36 AmountDelinquent 106315 non-null float64 37 DelinquenciesLast7Years 112947 non-null float64 38 PublicRecordsLast10Years 113240 non-null float64 39 PublicRecordsLast12Months 106333 non-null float64 40 RevolvingCreditBalance 106333 non-null float64 41 BankcardUtilization 106333 non-null float64 42 AvailableBankcardCredit 106393 non-null float64 43 TotalTrades 106393 non-null float64 44 TradesNeverDelinquent (percentage) 106393 non-null float64 45 TradesOpenedLast6Months 106393 non-null float64 46 DebtToIncomeRatio 105383 non-null float64 47 IncomeRange 113937 non-null object 48 IncomeVerifiable 113937 non-null bool 49 StatedMonthlyIncome 113937 non-null float64 50 LoanKey 113937 non-null object 51 TotalProsperLoans 22085 non-null float64 52 TotalProsperPaymentsBilled 22085 non-null float64 53 OnTimeProsperPayments 22085 non-null float64 54 ProsperPaymentsLessThanOneMonthLate 22085 non-null float64 55 ProsperPaymentsOneMonthPlusLate 22085 non-null float64 56 ProsperPrincipalBorrowed 22085 non-null float64 57 ProsperPrincipalOutstanding 22085 non-null float64 58 ScorexChangeAtTimeOfListing 18928 non-null float64 59 LoanCurrentDaysDelinquent 113937 non-null int64 60 LoanFirstDefaultedCycleNumber 16952 non-null float64 61 LoanMonthsSinceOrigination 113937 non-null int64 62 LoanNumber 113937 non-null int64 63 LoanOriginalAmount 113937 non-null int64 64 LoanOriginationDate 113937 non-null object 65 LoanOriginationQuarter 113937 non-null object 66 MemberKey 113937 non-null object 67 MonthlyLoanPayment 113937 non-null float64 68 LP_CustomerPayments 113937 non-null float64 69 LP_CustomerPrincipalPayments 113937 non-null float64 70 LP_InterestandFees 113937 non-null float64 71 LP_ServiceFees 113937 non-null float64 72 LP_CollectionFees 113937 non-null float64 73 LP_GrossPrincipalLoss 113937 non-null float64 74 LP_NetPrincipalLoss 113937 non-null float64 75 LP_NonPrincipalRecoverypayments 113937 non-null float64 76 PercentFunded 113937 non-null float64 77 Recommendations 113937 non-null int64 78 InvestmentFromFriendsCount 113937 non-null int64 79 InvestmentFromFriendsAmount 113937 non-null float64 80 Investors 113937 non-null int64 dtypes: bool(3), float64(50), int64(11), object(17) memory usage: 68.1+ MB
df = df[['CreditGrade','BorrowerRate','BorrowerState','LoanOriginalAmount','IsBorrowerHomeowner','TotalCreditLinespast7years','DelinquenciesLast7Years']]
print(len(df))
print(len(df.drop_duplicates()))
113937 111906
81 columns, 113,937 rows, no duplicate value, some of columns needs wrangle.
I am going to draw 9 pictures in Univariate,bivariate and multivariate chapter.
For univariate, I am going to find some characteristics of dataset, including distribution of credit rank, interest rate and borrower's location on the map.
For bivariate visualisation, I am going to dig deeper based on the result above. Such as the different rank's interest rate, the relationship between loan price and interest rate and average loan USD in different states.
For multivariate visualisation, I am going to add more dimension on bivariate visualisation result.
Credit grade, borrower rate, borrower state, loan original amount, is borrower home owner, total credit lines past 7 years, delinquencies last 7 years.
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
CreditGrade = df[['CreditGrade']].dropna().value_counts()
CreditGrade.index
MultiIndex([( 'C',),
( 'D',),
( 'B',),
('AA',),
('HR',),
( 'A',),
( 'E',),
('NC',)],
names=['CreditGrade'])
index=['C','D','B','AA','HR','A','E','NC']
plt.figure(figsize=[20,10])
plt.pie(CreditGrade,labels=index,startangle=90,counterclock=False,wedgeprops = {'width' : 0.4})
plt.axis('square')
plt.title("Borrower's distribution of their credit grade")
#Here, we found the distribution of different credit rank. However, from the pictures below we cannot find very
#obvious information. If we add more variable, we may find some valuable information from mass dataset.
Text(0.5, 1.0, "Borrower's distribution of their credit grade")
plt.figure(figsize=[20,10])
plt.hist(df['BorrowerRate'])
plt.title("Histogram of borrower's interest rate")
plt.xlabel("Interest Rate")
plt.ylabel("Listing Counts")
#From the picture below we know that most of borrower's rate is between 10% to 20%.
#For the next step, we can try to analyse the characteristics of extremely low(lower than 5%) and high(higher than 35%).
#Do they have higher/lower credit rank? Where are they living? What is their occupation?
#We may find the reason why they can borrow money cheaper/more expensive.
Text(0, 0.5, 'Listing Counts')
import plotly.graph_objects as go
states = pd.DataFrame(df['BorrowerState'].value_counts()).reset_index()
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
fig = go.Figure(data=go.Choropleth(
colorscale=scl,
autocolorscale=False,
locations=states['index'],
z=states['BorrowerState'],
locationmode='USA-states',
colorbar_title = "Listing counts"))
fig.update_layout(title="Distribution of borrower in the US nationalwide.",
geo_scope='usa')
fig.show()
#The first impression of the visualisation below are brilliant. However, I found the listing distribution is
#perfectly match the population of the United States. It weaken the result of the map below.
#So, if I am going to dive deeper, I am going to use some other variables to find the relationship between
#the listing in different states.
Make sure that, after every plot or related series of plots, that you include a Markdown cell with comments about what you observed, and what you plan on investigating next.
There are some unusual points in interests, we can see some incredible low and high interests in the distribution histogram. We may find more about what kind of people can borrow money with cheaper/more expensive price.
Firstly, We found there are not enough information in the first pie chart. Credit rank information needs to combine with other variables to find useful information. For geographic information, it gives some useful information, but those information can be reasoning from people's common sense. If we combine this and some other variables, it may gives us better insight(such as average loan, interests, etc.)
In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
CreditInterests=df[['CreditGrade','BorrowerRate']].dropna()
plt.figure(figsize=[20,10])
sb.violinplot(data=CreditInterests,x='CreditGrade',y='BorrowerRate')
plt.title("The relationship between borrower rate and credit grade")
# Here, combine the borrower rate and credit rate, we can find the relationship between borrower rate and their credit.
# In the most of situations, if people have better credit grade, they can borrow money with cheaper price.
# such as people with AA grade's obvious peak is lower than 0.1, which means less than 10 per cent.
# But in grade A cluster, the peak of violin is settled higher than 0.1, approximately 11-12 per cent.
# In the next chapter I am going to use facet grid to dive deeper based on the result we have to find more information
# group by credit grade.
Text(0.5, 1.0, 'The relationship between borrower rate and credit grade')
LoanAmountInterest=df[['BorrowerRate','LoanOriginalAmount']]
plt.figure(figsize=[20,10])
plt.scatter(data=LoanAmountInterest,x='BorrowerRate',y='LoanOriginalAmount',alpha=0.2)
plt.title("The relationship between borrower rate and loan original amount")
# From the scatter plot below, we can see approximate shape of the dots are triangle skewed to the left.
# This means for those people can borrow money in cheaper price, in most of situation they can borrow more money
# than people in higher interest rate cluster.
# Another interesting result is from the chart, most of people choose to borrow whole number of money.
# We can see some clear line in 10000, 15000, 20000, 25000 and 30000.
# We are going to find the difference between estate owner and non estate owner's loan status in the next part.
Text(0.5, 1.0, 'The relationship between borrower rate and loan original amount')
AvgLoanPerState=df[['BorrowerState','LoanOriginalAmount']].groupby('BorrowerState').mean().reset_index()
fig = go.Figure(data=go.Choropleth(
colorscale='blues',
autocolorscale=False,
locations=AvgLoanPerState['BorrowerState'],
z=AvgLoanPerState['LoanOriginalAmount'],
locationmode='USA-states',
colorbar_title = "Average loan(USD)"))
fig.update_layout(title="Distribution of loan original amount in the US nationalwide.",
geo_scope='usa')
fig.show()
# Comparing with the the first choropleth, this one gives us more information.
# We can see east/west coast's average loan is higher than the rest of country,
# and there are some clusters on the map, such as new England and California-Nevada has higher average loan,
# and bible belt have lower average loan. However, there are three special outliers on the map.
# They are North Dakota, Iowa and Maine. their average loan is extremely lower than the most of states in the US.
# Maybe we can add more variables to find more information about the difference between each states, and luckily
# we may find the reason of those outlier.
Your answer here!
Your answer here!
Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
LoanAndInterest = df[['CreditGrade','BorrowerRate','LoanOriginalAmount']].dropna().reset_index()
g = sb.FacetGrid(data=LoanAndInterest, col='CreditGrade',height=3,col_wrap=4)
g.map(plt.scatter, 'BorrowerRate','LoanOriginalAmount',alpha=0.25)
# plt.title('The relationship between loan original amount and borrower rate in different credit rate.')
# The following result shows in different credit grade's people's loan status.
# Overall, the shape of dot cluster looks like a square.
# The difference between high credit grade's square and lower's one is the square's location and shape.
# People with higher credit grade such as AA and A's shape is very tall and thin, which means they are very easy to
# borrow a lot of money in very low interest. and the situation begin to change from grade B. the right side of
# the square begin to move far right, and the top of square's dot are more and more sparse. This means for those
# people in lower credit grade, even their interest are higher, they cannot borrow too much money.
# For people with low and unknown credit record such as HR, NC and E, it is almost impossible to borrow money more
# than 10000.
<seaborn.axisgrid.FacetGrid at 0x7f841045d5b0>
HomeOwner=[]
for each in df[['IsBorrowerHomeowner']]['IsBorrowerHomeowner']:
if each == True:
HomeOwner.append('Yes')
else:
HomeOwner.append('No')
HomeAndInterest = df[['BorrowerRate','LoanOriginalAmount']]
HomeAndInterest['IsBorrowerHomeowner'] = HomeOwner
HomeAndInterest.head()
g = sb.FacetGrid(data=HomeAndInterest, hue='IsBorrowerHomeowner', height=10)
g.map(plt.scatter, 'BorrowerRate', 'LoanOriginalAmount',alpha=0.25)
g.add_legend()
plt.title("Comparison of home owner and no home owner's loan status.")
# Compare with the The relationship between borrower rate and loan original amount, it is obvious that people with
# real estate are more convenient to borrow money from financial department. We can see people is getting harder and
# harder to borrow more than 10000 money if they have no real estate.
<ipython-input-16-31acec015697>:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Text(0.5, 1.0, "Comparison of home owner and no home owner's loan status.")
BadDebtRate = df[['BorrowerState','TotalCreditLinespast7years','DelinquenciesLast7Years']]
BadDebtRate['BadDebtRate'] = df['DelinquenciesLast7Years']/df['TotalCreditLinespast7years']
BadDebtRate = BadDebtRate[['BorrowerState','BadDebtRate']]
BadDebtRate = BadDebtRate.groupby('BorrowerState').mean().reset_index()
fig = go.Figure(data=go.Choropleth(
colorscale='reds',
autocolorscale=False,
locations=BadDebtRate['BorrowerState'],
z=BadDebtRate['BadDebtRate'],
locationmode='USA-states',
colorbar_title = "Average Bad Debt Rate"))
fig.update_layout(title="Average Bad Debt Rate in Different States",
geo_scope='usa')
fig.show()
# Although east/west coast has the highest average loan, their bad debt rate is not very high when comparing with
# middle part of the United States. Another interesting result from North Dakota, Iowa and Maineās low average loan
# is, their average bad debt rate is obviously top 3 nationwide. On the countrary, Wyoming, which is the lowest
# bad debt rate, its average loan is higher than its neighborhood.
<ipython-input-17-df4b03c20624>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
If a person has better credit rank, this means he or she can borrow money with lower interests. People with low interests have to loan less money in higher interest rates. Ownership of a property can also helps. People without property almost cannot borrow money more than 25000 in low interest rate.
I was trying to find the relationship between geographic location and loan status. We can find some interesting result. East/west coast people loan more money than the rest of country, but their credit rank and average bad debt rate are not that low. And we can see some regular patterns on the map, such as some states have very serious bad debt situation and their average loan price is also lower than their neighborhood. We cannot say which one comes first, but we know they must have some relationship.
At the end of your report, make sure that you export the notebook as an html file from the
File > Download as... > HTMLmenu. Make sure you keep track of where the exported file goes, so you can put it in the same folder as this notebook for project submission. Also, make sure you remove all of the quote-formatted guide notes like this one before you finish your report!